More data and methods for metric learning
library(tidyverse)
library(dimRed)
library(reticulate)
library(here)
library(viridis)
library(hdrcde)
library(igraph)
library(matrixcalc)
library(akima)
library(car)
library(ggforce)
library(ks)
library(patchwork)
Jmisc::sourceAll(here::here("R/sources"))
set.seed(123)Metric learning
Setup
train <- readRDS(here::here("data/spdemand_1id336tow_train.rds"))
# train <- readRDS(here::here("data/spdemand_3639id_notow_201length.rds"))
# Parameters fixed
x <- train
N <- nrow(train)
s <- 2
k <- 20
method <- "annIsomap"
annmethod <- "kdtree"
distance <- "euclidean"
treetype <- "kd"
searchtype <- "radius" # change searchtype for radius search based on `radius`, or KNN search based on `k`
radius <- .4 # the bandwidth parameter, \sqrt(\elsilon), as in algorithmThe metric learning algorithm
metric_isomap <- metricML(x, s = s, k = k, radius = radius, method = method, annmethod = annmethod, eps = 0, distance = distance, treetype = treetype, searchtype = searchtype, invert.h = TRUE)## Warning in matchPars(methodObject, list(...)): Parameter matching: perplexity is
## not a standard parameter, ignoring.
## Warning in matchPars(methodObject, list(...)): Parameter matching: theta is not
## a standard parameter, ignoring.
summary(metric_isomap)## Length Class Mode
## embedding 672 -none- numeric
## rmetric 1344 -none- numeric
## weighted_graph 10 igraph list
## adj_matrix 112896 dgCMatrix S4
## laplacian 112896 dgCMatrix S4
Variable kernel density estimate
For multivariate data, the variable kernel density estimate is given by \[\hat{f}(x) = n^{-1} \sum_i K_{H_i} (x - X_i).\]
Outlier plot based on density estimates
Fixed bandwidth
# fixed bandwidth
fn <- metric_isomap$embedding
E1 <- fn[,1] # rename as Ed to match the aesthetics in plot_ellipse()
E2 <- fn[,2]
prob <- c(1, 50, 99)
p_hdr_isomap <- hdrscatterplot(E1, E2, levels = prob, noutliers = 20, label = NULL)
p_hdr_isomap_p <- p_hdr_isomap +
plot_ellipse(metric_isomap, add = T, n.plot = 50, scale = 20,
color = blues9[5], fill = blues9[5], alpha = 0.2)
# p_hdr_isomap_pPointwise adaptive bandwidth
Rn <- metric_isomap$rmetric # array
n.grid <- 10
f <- vkde2d(x = fn[,1], y = fn[,2], h = Rn, n = n.grid)
plot_contour(metric_isomap, n.grid = n.grid, scale = 1/20)source(here::here("R/sources/hdrplotting.R"))
p_isomap <- plot_outlier(x = metric_isomap, n.grid = 50, prob = prob, scale = 1/8)(p_hdr_isomap_p + p_isomap$p ) + coord_fixed()t-SNE
x <- train
method <- "anntSNE"
perplexity <- round(k / 3) # 30 by default
theta <- 0 # for exact tSNE in the C++ tSNE Barnes-Hut implementation
# tictoc::tic()
metric_tsne <- metricML(x, s = s, k = k, radius = radius, method = method, annmethod = annmethod, eps = 0, distance = distance, treetype = treetype, searchtype = searchtype, perplexity = perplexity, theta = theta, invert.h = TRUE)
# summary(metric_tsne)
# tictoc::toc()ftsne <- metric_tsne$embedding
E1 <- ftsne[,1]; E2 <- ftsne[,2]
# plot_embedding(metric_tsne)
# plot_ellipse(metric_tsne, n.plot = 50)
plot_contour(metric_tsne, n.grid = 50, scale = 1/20)UMAP
x <- train
method <- "annUMAP"
# perplexity <- round(k / 3) # 30 by default
theta <- 0 # for exact tSNE in the C++ tSNE Barnes-Hut implementation
# tictoc::tic()
metric_umap <- metricML(x, s = s, k = k, radius = radius, method = method, annmethod = annmethod, eps = 0, distance = distance, treetype = treetype, searchtype = searchtype, perplexity = perplexity, theta = theta, invert.h = TRUE)
# summary(metric_umap)
# tictoc::toc()fumap <- metric_umap$embedding
E1 <- fumap[,1]; E2 <- fumap[,2]
# plot_embedding(metric_umap)
# plot_ellipse(metric_umap, n.plot = 50)
plot_contour(metric_umap, n.grid = 50, scale = 1/20)Compare outliers from Isomap and t-SNE
To compare two manifold learning methods, we calculate the overlap among outliers. To do so, we first rank the outliers from highest to lowest densities. Then calculate the correlation of ranks for both methods.
To show our methods performs better, we expect a higher correlation of ranks. This shows robustness in finding outliers using different manifold learning methods.
(p_isomap$p + p_tsne$p + p_umap$p) + coord_fixed()Outlier ranking comaprison using variable bandwidth
The ranking of outliers can be compared as follows.
p_isomap_all <- plot_outlier(x = metric_isomap, n.grid = 50, prob = prob, noutliers = N, scale = 1/20)
p_tsne_all <- plot_outlier(x = metric_tsne, n.grid = 50, prob = prob, noutliers =N, scale = 1/20)
p_umap_all <- plot_outlier(x = metric_umap, n.grid = 50, prob = prob, noutliers =N, scale = 1/20)
head(p_isomap_all$outlier, n = 20)## [1] "135" "323" "228" "17" "35" "274" "324" "326" "39" "328" "182" "38"
## [13] "183" "327" "218" "134" "185" "231" "237" "279"
head(p_tsne_all$outlier, n = 20)## [1] "299" "296" "300" "297" "298" "200" "199" "157" "158" "118" "117" "98"
## [13] "116" "99" "188" "189" "186" "187" "100" "248"
head(p_umap_all$outlier, n = 20)## [1] "17" "16" "257" "274" "208" "244" "113" "3" "195" "51" "245" "65"
## [13] "148" "131" "256" "291" "306" "112" "99" "209"
outlier_isomap <- order(as.numeric(p_isomap_all$outlier))
outlier_tsne <- order(as.numeric(p_tsne_all$outlier))
# outlier_umap <- order(as.numeric(p_umap_all$outlier))
# outliers <- cbind(outlier_isomap, outlier_tsne)
cor(outlier_isomap, outlier_tsne)## [1] -0.06160876
# ggcorrplot::ggcorrplot(r, hc.order = TRUE, type = "lower", lab = TRUE)
topconfects::rank_rank_plot(outlier_isomap, outlier_tsne, n=50)library(topconfects)
a <- sample(letters)
b <- sample(letters)
rank_rank_plot(a,b, n=20)Outlier ranking comaprison using fixed bandwidth
head(p_isomap$outlier, n = 20)## [1] "135" "326" "38" "182" "39" "134" "324" "37" "328" "327" "276" "183"
## [13] "181" "228" "278" "237" "133" "323" "86" "231"
head(p_hdr_tsne$outlier, n = 20)## [1] "157" "296" "299" "158" "300" "297" "252" "237" "303" "298" "118" "130"
## [13] "246" "117" "98" "302" "322" "253" "179" "254"
hdroutlier_isomap <- order(as.numeric(p_isomap$outlier))
hdroutlier_tsne <- order(as.numeric(p_hdr_tsne$outlier))
# hdroutliers <- cbind(hdroutlier_isomap, hdroutlier_tsne)
cor(hdroutlier_isomap, hdroutlier_tsne)## [1] 0.1925199
cor(hdroutlier_isomap %>% head(50), hdroutlier_tsne %>% head(50))## [1] 0.2977834
topconfects::rank_rank_plot(hdroutlier_isomap, hdroutlier_tsne, n=50)Compare high density regions
Similarly, we compare the highest density regions from both methods and expect a similar output in detecting most typical observations.
Variable bandwidth
tail(p_isomap_all$outlier, n = 20)## [1] "166" "78" "66" "25" "262" "176" "223" "125" "23" "20" "311" "261"
## [13] "121" "77" "222" "67" "175" "167" "260" "163"
tail(p_tsne_all$outlier, n = 20)## [1] "277" "138" "88" "37" "140" "42" "84" "326" "43" "137" "276" "36"
## [13] "86" "41" "89" "85" "44" "181" "90" "139"
cor(outlier_isomap %>% tail(50), outlier_tsne %>% tail(50))## [1] -0.0488914
Fixed bandwidth
tail(p_isomap$outlier, n = 20)## [1] "303" "167" "163" "78" "260" "118" "166" "67" "254" "164" "31" "261"
## [13] "158" "175" "223" "32" "222" "20" "176" "23"
tail(p_hdr_tsne$outlier, n = 20)## [1] "263" "317" "126" "310" "262" "261" "80" "89" "319" "139" "312" "168"
## [13] "318" "24" "268" "269" "30" "81" "90" "211"
cor(hdroutlier_isomap %>% tail(50), hdroutlier_tsne %>% tail(50))## [1] 0.3175251